{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 根据device_ip统计C_site_id和C_app_id的用户规模"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import sys\n",
    "from collections import defaultdict\n",
    "import numpy as np\n",
    "\n",
    "# One set of device_ip strings per id; using a set collapses repeated\n",
    "# occurrences of the same address under the same id.\n",
    "site_id_users, app_id_users = defaultdict(set), defaultdict(set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "1m\n",
      "2m\n",
      "3m\n",
      "4m\n"
     ]
    }
   ],
   "source": [
    "# NOTE(review): despite the tr_ prefix, this path points at test_FE.csv.\n",
    "tr_path = 'data/test_FE.csv'\n",
    "# newline='' hands line-ending handling to the csv module, as its docs require\n",
    "# for file objects passed to a reader.\n",
    "with open(tr_path, 'r', newline='') as csv_file:\n",
    "    for i, row in enumerate(csv.DictReader(csv_file), start=1):\n",
    "        # Remember every device_ip seen under this row's site id and app id.\n",
    "        site_id_users[row['C_site_id']].add(row['device_ip'])\n",
    "        app_id_users[row['C_app_id']].add(row['device_ip'])\n",
    "        # Progress marker on stderr once per million rows.\n",
    "        if i % 1000000 == 0:\n",
    "            sys.stderr.write('{0}m\\n'.format(i // 1000000))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "1m\n",
      "2m\n",
      "3m\n",
      "4m\n",
      "5m\n",
      "6m\n",
      "7m\n",
      "8m\n",
      "9m\n",
      "10m\n",
      "11m\n",
      "12m\n",
      "13m\n",
      "14m\n",
      "15m\n",
      "16m\n",
      "17m\n",
      "18m\n",
      "19m\n",
      "20m\n",
      "21m\n",
      "22m\n",
      "23m\n",
      "24m\n",
      "25m\n",
      "26m\n",
      "27m\n",
      "28m\n",
      "29m\n",
      "30m\n",
      "31m\n",
      "32m\n",
      "33m\n",
      "34m\n",
      "35m\n",
      "36m\n",
      "37m\n",
      "38m\n",
      "39m\n",
      "40m\n"
     ]
    }
   ],
   "source": [
    "# NOTE(review): despite the ts_ prefix, this path points at train_FE.csv.\n",
    "ts_path = 'data/train_FE.csv'\n",
    "# newline='' hands line-ending handling to the csv module, as its docs require\n",
    "# for file objects passed to a reader.\n",
    "with open(ts_path, 'r', newline='') as csv_file:\n",
    "    for i, row in enumerate(csv.DictReader(csv_file), start=1):\n",
    "        # Remember every device_ip seen under this row's site id and app id.\n",
    "        site_id_users[row['C_site_id']].add(row['device_ip'])\n",
    "        app_id_users[row['C_app_id']].add(row['device_ip'])\n",
    "        # Progress marker on stderr once per million rows.\n",
    "        if i % 1000000 == 0:\n",
    "            sys.stderr.write('{0}m\\n'.format(i // 1000000))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Empty id -> integer containers. The dtype is spelled out because a bare\n",
    "# pd.Series() depends on a default that differs between pandas versions and\n",
    "# triggers a warning on pandas 1.x.\n",
    "app_id_dict = pd.Series(dtype='int64')\n",
    "site_id_dict = pd.Series(dtype='int64')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map each app id to the integer part of log10 of its distinct device_ip count.\n",
    "# The Series is built in a single call: assigning previously unseen labels one\n",
    "# at a time re-allocates the Series on every insert.\n",
    "app_id_dict = pd.Series(\n",
    "    {app_id: int(np.log10(len(ips))) for app_id, ips in app_id_users.items()},\n",
    "    dtype='int64',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map each site id to the integer part of log10 of its distinct device_ip count.\n",
    "# The Series is built in a single call: assigning previously unseen labels one\n",
    "# at a time re-allocates the Series on every insert.\n",
    "site_id_dict = pd.Series(\n",
    "    {site_id: int(np.log10(len(ips))) for site_id, ips in site_id_users.items()},\n",
    "    dtype='int64',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reorder the app ids by their stored value, largest first.\n",
    "app_id_dict = app_id_dict.sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reorder the site ids by their stored value, largest first.\n",
    "site_id_dict = site_id_dict.sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Write both id -> value mappings as JSON. The with-blocks guarantee each file\n",
    "# is flushed and closed as soon as its dump finishes, even if the dump raises.\n",
    "with open(\"data/app_id_users_dict.json\", \"w\") as app_file:\n",
    "    json.dump(app_id_dict.to_dict(), app_file)\n",
    "with open(\"data/site_id_users_dict.json\", \"w\") as site_file:\n",
    "    json.dump(site_id_dict.to_dict(), site_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
